import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
# Render every Plotly figure in this notebook with the white theme.
pio.templates.default = "plotly_white"

# Load the credit scoring dataset and print its column/dtype summary.
csv_path = "/Users/abelabykuriakose/downloads/credit_scoring.csv"
df = pd.read_csv(csv_path)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1000 non-null int64 1 Gender 1000 non-null object 2 Marital Status 1000 non-null object 3 Education Level 1000 non-null object 4 Employment Status 1000 non-null object 5 Credit Utilization Ratio 1000 non-null float64 6 Payment History 1000 non-null float64 7 Number of Credit Accounts 1000 non-null int64 8 Loan Amount 1000 non-null int64 9 Interest Rate 1000 non-null float64 10 Loan Term 1000 non-null int64 11 Type of Loan 1000 non-null object dtypes: float64(3), int64(4), object(5) memory usage: 93.9+ KB
# Preview the first rows of the loaded DataFrame (head() defaults to 5 rows).
df.head()
| | Age | Gender | Marital Status | Education Level | Employment Status | Credit Utilization Ratio | Payment History | Number of Credit Accounts | Loan Amount | Interest Rate | Loan Term | Type of Loan |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | Male | Married | Master | Employed | 0.22 | 2685.0 | 2 | 4675000 | 2.65 | 48 | Personal Loan |
| 1 | 25 | Male | Married | High School | Unemployed | 0.20 | 2371.0 | 9 | 3619000 | 5.19 | 60 | Auto Loan |
| 2 | 30 | Female | Single | Master | Employed | 0.22 | 2771.0 | 6 | 957000 | 2.76 | 12 | Auto Loan |
| 3 | 58 | Female | Married | PhD | Unemployed | 0.12 | 1371.0 | 2 | 4731000 | 6.57 | 60 | Auto Loan |
| 4 | 32 | Male | Married | Bachelor | Self-Employed | 0.99 | 828.0 | 2 | 3289000 | 6.28 | 36 | Personal Loan |
# Box plot showing how the credit utilization ratio is distributed.
credit_utilization_fig = px.box(
    data_frame=df,
    y='Credit Utilization Ratio',
    title='Credit Utilization Ratio Distribution',
)
credit_utilization_fig.show()
# Histogram of loan amounts, bucketed into (at most) 20 bins.
loan_amount_fig = px.histogram(
    data_frame=df,
    x='Loan Amount',
    title='Loan Amount Distribution',
    nbins=20,
)
loan_amount_fig.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Columns included in the correlation analysis.
numeric_columns = [
    'Credit Utilization Ratio',
    'Payment History',
    'Number of Credit Accounts',
    'Loan Amount',
    'Interest Rate',
    'Loan Term',
]
numeric_df = df[numeric_columns]

# Pairwise correlation matrix, drawn as an annotated heatmap.
corr = numeric_df.corr()
fig_corr_heatmap = sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.show()
/Applications/anaconda3/lib/python3.8/site-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2)
# Ordinal encodings for the categorical features used in the credit score.
education_level_mapping = {
    'High School': 1,
    'Bachelor': 2,
    'Master': 3,
    'PhD': 4,
}
employment_status_mapping = {
    'Unemployed': 0,
    'Employed': 1,
    'Self-Employed': 2,
}

# Replace each categorical column by its numeric encoding, in place.
# Series.map turns any value that is missing from the mapping into NaN.
categorical_encodings = {
    'Education Level': education_level_mapping,
    'Employment Status': employment_status_mapping,
}
for column_name, encoding in categorical_encodings.items():
    df[column_name] = df[column_name].map(encoding)
# Calculate a FICO-style credit score for every customer.
# NOTE(review): this is a simplified weighted sum over this dataset's columns
# (weights 0.35 / 0.30 / 0.15 / 0.10 / 0.10), not the actual FICO model.
# The score is computed on whole columns instead of looping over
# df.iterrows(); the terms are added in the same order as before, so each
# row's value is unchanged.
credit_score_series = (
    (df['Payment History'] * 0.35)
    + (df['Credit Utilization Ratio'] * 0.30)
    + (df['Number of Credit Accounts'] * 0.15)
    + (df['Education Level'] * 0.10)
    + (df['Employment Status'] * 0.10)
)
# Keep `credit_scores` available as a plain list, one entry per row.
credit_scores = credit_score_series.tolist()
# Add the credit scores as a new column to the DataFrame
df['Credit Score'] = credit_scores
df
| | Age | Gender | Marital Status | Education Level | Employment Status | Credit Utilization Ratio | Payment History | Number of Credit Accounts | Loan Amount | Interest Rate | Loan Term | Type of Loan | Credit Score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | Male | Married | 3 | 1 | 0.22 | 2685.0 | 2 | 4675000 | 2.65 | 48 | Personal Loan | 940.516 |
| 1 | 25 | Male | Married | 1 | 0 | 0.20 | 2371.0 | 9 | 3619000 | 5.19 | 60 | Auto Loan | 831.360 |
| 2 | 30 | Female | Single | 3 | 1 | 0.22 | 2771.0 | 6 | 957000 | 2.76 | 12 | Auto Loan | 971.216 |
| 3 | 58 | Female | Married | 4 | 0 | 0.12 | 1371.0 | 2 | 4731000 | 6.57 | 60 | Auto Loan | 480.586 |
| 4 | 32 | Male | Married | 2 | 2 | 0.99 | 828.0 | 2 | 3289000 | 6.28 | 36 | Personal Loan | 290.797 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | 59 | Male | Divorced | 1 | 1 | 0.74 | 1285.0 | 8 | 3530000 | 12.99 | 48 | Auto Loan | 451.372 |
| 996 | 64 | Male | Divorced | 2 | 0 | 0.77 | 1857.0 | 2 | 1377000 | 18.02 | 60 | Home Loan | 650.681 |
| 997 | 63 | Female | Single | 3 | 2 | 0.18 | 2628.0 | 10 | 2443000 | 18.95 | 12 | Personal Loan | 921.854 |
| 998 | 51 | Female | Married | 4 | 2 | 0.32 | 1142.0 | 3 | 1301000 | 1.80 | 24 | Auto Loan | 400.846 |
| 999 | 37 | Female | Married | 3 | 2 | 0.17 | 1028.0 | 5 | 4182000 | 9.34 | 24 | Auto Loan | 361.101 |
1000 rows × 13 columns
from sklearn.cluster import KMeans
# Cluster customers into four groups using the credit score as the only feature.
X = df[['Credit Score']]
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(X)

# Store each customer's cluster id as a categorical 'Segment' column.
segment_ids = kmeans.labels_
df['Segment'] = segment_ids
df['Segment'] = df['Segment'].astype('category')
# Scatter plot of every customer's credit score, colored by cluster segment.
fig = px.scatter(
    data_frame=df,
    x=df.index,
    y='Credit Score',
    color='Segment',
    color_discrete_sequence=['green', 'blue', 'yellow', 'red'],
)
layout_options = {
    'title': 'Customer Segmentation based on Credit Scores',
    'xaxis_title': 'Customer Index',
    'yaxis_title': 'Credit Score',
}
fig.update_layout(**layout_options)
fig.show()
# Give each cluster a human-readable name based on its credit score level.
# KMeans cluster ids are arbitrary (they can change with the data, the seed or
# the scikit-learn version), so the names are assigned by ranking the cluster
# centers from lowest to highest score instead of hard-coding the ids.
segment_names = ['Very Low', 'Low', 'Good', 'Excellent']
# cluster_centers_ has one row per cluster and a single 'Credit Score' column.
clusters_by_score = kmeans.cluster_centers_.ravel().argsort()
segment_name_by_cluster = {
    int(cluster_id): segment_name
    for cluster_id, segment_name in zip(clusters_by_score, segment_names)
}
df['Segment'] = df['Segment'].map(segment_name_by_cluster)
# Convert the 'Segment' column to category data type
df['Segment'] = df['Segment'].astype('category')
# Redraw the scatter plot now that the segments carry descriptive names.
fig = px.scatter(
    data_frame=df,
    x=df.index,
    y='Credit Score',
    color='Segment',
    color_discrete_sequence=['green', 'blue', 'yellow', 'red'],
)
layout_options = {
    'title': 'Customer Segmentation based on Credit Scores',
    'xaxis_title': 'Customer Index',
    'yaxis_title': 'Credit Score',
}
fig.update_layout(**layout_options)
fig.show()